home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
Garbo
/
Garbo.cdr
/
mac
/
hypercrd
/
hc1_2_x
/
fretext1.sit
/
Free Text Source Code
/
indexFile 0.72.c
< prev
next >
Wrap
Text File
|
1990-01-28
|
41KB
|
1,668 lines
/* new indexFile XFCN for Free Text projects
* copyright 1990 - Mark Zimmermann
*
* call the XFCN as 'indexFile()' ... it returns with nothing
* if all goes well, and an error message (if possible) otherwise...
*
* For further information, write:
* Mark ^Zimmermann
* P.O.Box 8310
* Silver Spring, MD 20907
* USA
*
* Be sure to enclose a STAMPED, SELF-ADDRESSED ENVELOPE if you want
* to receive a timely reply!
*
* Electronic addresses:
* science@nems.dt.navy.mil
* [75066,2044] CompuServe
*/
/* header files to include...
*/
#include <MacTypes.h>
#include <EventMgr.h>
#include <OSUtil.h>
#include <FileMgr.h>
#include <WindowMgr.h>
#include <StdFilePkg.h>
#include <HyperXCmd.h>
#include <SetUpA4.h>
/* sort on 28 characters -- long enough to avoid truncation of most real
* words, and short enough to save some space in the *.k files
*/
#define KEY_LENGTH 28
/* my standard structure for the index_keys file
*/
typedef struct
{
char kkey[KEY_LENGTH];
long ccount;
} KEY_REC;
/* my standard structure for simple I/O buffers ...
*/
typedef struct
{
char *zbufbase;
char *zbufp;
long zbufcounter;
long zbufsize;
int zbuffilep;
int zbufitemsize;
} ZBUF;
/* merge this many files at each stage of the merging operation for index
* building ... 2 means a binary merge, etc. ... one needs to have at least
* 5 + 2 * NMERGE I/O buffers around: for each of NMERGE files, there is
* a *.k keys file and a *.p pointers file; plus there must be a single
* output *.k and a single output *.p file; plus there is the need for stdin,
* stdout, and stderr to be open as well. Thus, I have found that a 4-way
* merge (NMERGE = 4) works pretty nicely....
*/
#define NMERGE 4
#ifndef TRUE
#define TRUE 1
#endif
#ifndef FALSE
#define FALSE 0
#endif
#ifndef NULL
#define NULL 0
#endif
#ifndef EOF
#define EOF (-1)
#endif
/* --------------function prototypes ----------------------- */
pascal void main (XCmdBlockPtr paramPtr);
int zGetFile (Str255 *fileNamePtr, int *volRefNumPtr);
void returnErrorMsg (XCmdBlockPtr paramPtr, char *msg);
void give_msg (char *msg);
void beepWait (void);
long atol (char *s);
int putNum (char *ans, long num);
int strlen (char *s);
char *strcpy (char *s1, char *s2);
void create_zbuffer (int n, long bufsize, int buffile, int bufitemsize,
ZBUF zbuffer[]);
char *next_input_item (int n, ZBUF zbuffer[]);
void load_zbuffer (int n, ZBUF zbuffer[]);
char *next_output_item (int n, ZBUF zbuffer[]);
void flush_zbuffer (int n, ZBUF zbuffer[]);
int build_indices (long zbufsiz, int doc_file, int vRef0, int pass_number);
void init_filter_table (void);
char *make_buf (long bufsiz);
long load_doc_buffer (char *doc, long doc_bufsiz, char **ptr, int doc_file);
int zgetc (int refNum);
char *strncpy(char *s1, char *s2, int n);
void nway_merge_kpfiles (int ink[], int inp[], int outk, int outp, int n,
long zbufsiz);
void copy_ptr_recs (int inzbuf, long count, int outzbuf, ZBUF zbuffer[]);
void copy_key_rec (char *kkey, long ccount, int outzbuf, ZBUF zbuffer[]);
int merge_not_finished (KEY_REC *kr[], int n);
int smallest_key (KEY_REC *kr[], int n);
int merge_indices (long zbufsiz, int doc_file, int vRef0,
int file_number, int generation_number, Str255 doc_filename);
long write_sorted_files (char *doc, char **ptr, long nwords, long offset,
long zbufsiz, int pass_number, int vRef0);
int is_new_word (char *w0, char *w1);
void write_new_key (char *p, char *kp);
long file_size (int refnum);
int open_inkfile (int file_num, int vRef0, int generation_number);
int open_inpfile (int file_num, int vRef0, int generation_number);
void fix_oddball_file_name (int vRef0, int generation_number,
int file_number);
void fix_final_file_names (Str255 doc_filename, int vRef0,
int generation_number);
int open_outkfile (int vRef0, int generation_number, int file_number);
int open_outpfile (int vRef0, int generation_number, int file_number);
void remove_used_infiles (int n, int vRef0, int generation_number,
int file_number);
void close_infiles (int ink[], int inp[], int n);
int open_kfile (int vRef0, int pass_number);
int open_pfile (int vRef0, int pass_number);
long zftell (int refnum);
void zqsort (char **first, char **last);
int compare_ptrs (char **p1, char **p2);
int zstrcmp (char *s1, char *s2);
/* standard table to filter lower-case characters to capitals (if possible)
* for sorting into alphabetical order ... my best estimates of what to
* do for foreign characters in the Apple character set ... consider
* using international utilities someday to do the sorting better...
*/
char filter_table[256] =
{ 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
'0', '1', '2', '3', '4', '5', '6', '7',
'8', '9', 0, 0, 0, 0, 0, 0,
0, 'A', 'B', 'C', 'D', 'E', 'F', 'G',
'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O',
'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W',
'X', 'Y', 'Z', 0, 0, 0, 0, 0,
0, 'A', 'B', 'C', 'D', 'E', 'F', 'G',
'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O',
'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W',
'X', 'Y', 'Z', 0, 0, 0, 0, 0,
0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87,
0xCB,0x89,0x80,0xCC,0x81,0x82,0x83,0x8F,
0x90,0x91,0x92,0x93,0x94,0x95,0x84,0x97,
0x98,0x99,0x85,0xCD,0x9C,0x9D,0x9E,0x86,
0, 0, 0, 0, 0, 0, 0,0xA7,
0, 0, 0, 0, 0, 0,0xAE,0xAF,
0, 0, 0, 0, 0,0xB5,0xC6,0xB7,
0xB8,0xB8, 0,0xBB,0xBC,0xBD,0xAE,0xAF,
0, 0, 0, 0, 0, 0,0xC6, 0,
0, 0, 0,0xCB,0xCC,0xCD,0xCE,0xCE,
0, 0, 0, 0, 0, 0, 0, 0,
0xD8, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0 } ;
/* ----------------------main program------------------------- */
pascal void main (paramPtr)
XCmdBlockPtr paramPtr;
{
Str255 doc_filename;
char info[128];
Ptr testzbufsiz;
WindowRecord w_record;
WindowPtr info_window;
Rect b_rect;
Handle answer;
int doc_file, vRef0, pass_number, generation_number,
prev_gen_number, file_number, i;
long zbufsiz, doc_filesize, grow;
RememberA0();
SetUpA4();
if (paramPtr->paramCount != 0)
{
returnErrorMsg (paramPtr,
"{Sorry, wrong number of parameters in indexFile call!}");
RestoreA4();
return;
}
/* see how much memory we can get for our zbuffers ... */
MaxApplZone();
zbufsiz = MaxMem (&grow) - 32768;
zbufsiz /= (2 * NMERGE + 2);
zbufsiz = zbufsiz - zbufsiz % (sizeof(KEY_REC) * sizeof(long));
if (zbufsiz < sizeof(KEY_REC) * sizeof(long) ||
(testzbufsiz = NewPtr (zbufsiz * (2 * NMERGE + 2))) == NULL)
{
returnErrorMsg (paramPtr,
"{Sorry, not enough free memory to build an index!}");
RestoreA4();
return;
}
DisposPtr (testzbufsiz);
/* set up a window to give user feedback during indexing */
b_rect.top = 30;
b_rect.left = 12;
b_rect.bottom = 332;
b_rect.right = 500;
info_window = NewWindow (&w_record, &b_rect, "\p", (Boolean)1,
dBoxProc, (WindowPtr)-1, (Boolean)0, (long)0);
ShowWindow (info_window);
SetPort (info_window);
TextFont (0);
give_msg ("\pGreetings! Please choose a file to be indexed...");
if (! zGetFile (&doc_filename, &vRef0))
{
CloseWindow (info_window);
RestoreA4();
return;
}
i = FSOpen (doc_filename, vRef0, &doc_file);
if (i != noErr && i != opWrErr)
{
give_msg ("\pSorry, error opening the file! Click mouse to exit...");
beepWait ();
CloseWindow (info_window);
RestoreA4();
return;
}
give_msg ("\pBeginning to build index subfiles ╤ please be patient...");
pass_number = 0;
while (build_indices (zbufsiz, doc_file, vRef0, pass_number))
++pass_number;
if (pass_number == 0)
give_msg ("\pNo data to index!");
else
{
if (pass_number > 1)
give_msg ("\pBeginning merge generation #1 ╤ please be patient...");
file_number = 0;
generation_number = 0;
do
{
prev_gen_number = generation_number;
generation_number = merge_indices (zbufsiz, doc_file, vRef0,
file_number, generation_number, doc_filename);
if (generation_number == prev_gen_number)
file_number += NMERGE;
else
file_number = 0;
}
while (generation_number >= 0);
give_msg ("\pIndexing completed!");
}
FSClose (doc_file);
give_msg ("\pClick mouse to exit...");
beepWait ();
CloseWindow (info_window);
RestoreA4();
return;
}
/* this routine, adapted from the Lightspeed C 'MiniEdit' example, does
* the standard files dialog routine to get the name of the file to be indexed
* for the database; other file names are gotten from that by putting
* a '.k' or a '.p' at the end. This routine returns 1 if all is well,
* or 0 if the user cancels out...
*
* note that the file type is not used here, though a parameter has to be
* passed (I think) to SFGetFile()....
*/
int zGetFile (fileNamePtr, volRefNumPtr)
Str255 *fileNamePtr;
int *volRefNumPtr;
{
SFTypeList myFileTypes;
Point SFGwhere;
SFReply myReply;
register int i;
SFGwhere.v = 90;
SFGwhere.h = 82;
myFileTypes[0] = 'TEXT'; /* this isn't actually used */
SFGetFile (SFGwhere, "\p", 0L, -1, myFileTypes, 0L, &myReply);
if (myReply.good)
{
for (i = *myReply.fName; i >= 0; --i)
(*fileNamePtr)[i] = myReply.fName[i];
*volRefNumPtr = myReply.vRefNum;
return (1);
}
else
return (0);
}
/* function to set the return value of the XFCN to a chosen error msg;
* if there isn't enough free memory to give us a Handle to the msg,
* beep a bit more and then return!
*/
void returnErrorMsg (paramPtr, msg)
XCmdBlockPtr paramPtr;
char *msg;
{
Handle answer;
int msgLength;
SysBeep (10);
msgLength = strlen (msg);
if ((answer = NewHandle (1 + msgLength)) == NULL)
{
SysBeep (10);
SysBeep (10);
return;
}
strcpy (*answer, msg);
paramPtr->returnValue = answer;
return;
}
/* tiny routine to put a message into my message window.... takes in a
* PASCAL type string and displays it decently, scrolling text up....
*/
void give_msg (msg)
char *msg;
{
Rect b_rect;
RgnHandle theRgn;
b_rect.top = b_rect.left = 0;
b_rect.bottom = 302;
b_rect.right = 488;
theRgn = NewRgn ();
ScrollRect (&b_rect, 0, -15, theRgn);
DisposeRgn (theRgn);
MoveTo (4, 295);
DrawString (msg);
return;
}
/* function to beep and then wait for a mouse click before returning...
* delay for 2 ticks to debounce the mouse button a bit .... flush
* events to avoid spurious clicks hanging around....
*/
void beepWait ()
{
long tickcount;
SysBeep (10);
while (! Button())
;
Delay (2L, &tickcount);
while (Button())
;
Delay (2L, &tickcount);
FlushEvents (everyEvent, 0);
return;
}
/* function to convert alphabetic string to a long integer ...
*/
long atol (s)
register char *s;
{
register char signflag = 0;
register long r = 0;
while ((*s == ' '))
s++;
if (*s == '-')
{
signflag = 1;
s++;
}
else if (*s == '+')
s++;
while (*s >= '0' && *s <= '9')
r = r * 10 + (*s++ - '0');
return (signflag ? -r : r);
}
/* function to convert a number into a string and put it into the chosen
* target place ... returns the number of characters stored ...
* based on K&R p. 60 example of itoa()....
*
* ---> note, there is NO TERMINAL '\0' in the string that is built!!!! <---
*/
int putNum (ans, num)
register char *ans;
register long num;
{
register int i, j;
int s, result;
i = 0;
s = 1;
if (num < 0)
{
num = -num;
s = -1;
}
do
ans[i++] = num % 10 + '0';
while ((num /= 10) > 0);
if (s < 0)
ans[i++] = '-';
result = i;
for (--i, j = 0; j < i; ++j, --i)
{
s = ans[i];
ans[i] = ans[j];
ans[j] = s;
}
return (result);
}
/* function to determine the length of a string ... standard thing,
* adapted from K&R p.98 ....
*/
int strlen (s)
register char *s;
{
char *s0 = s;
while (*s++)
;
return (s - s0 - 1);
}
/* function to copy a string from one place to another, in a rather
* obvious fashion ... adapted from K&R p.101 ....
*/
char *strcpy (s1, s2)
register char *s1, *s2;
{
char *s = s1;
while (*s1++ = *s2++)
;
return (s);
}
/* -------------------buffer-related functions-----------------*/
/* function to create a zbuffer and set its parameters appropriately
*/
void create_zbuffer (n, bufsize, buffile, bufitemsize, zbuffer)
int n, bufitemsize;
long bufsize;
int buffile;
ZBUF zbuffer[];
{
zbuffer[n].zbufbase = make_buf (bufsize);
zbuffer[n].zbufp = zbuffer[n].zbufbase;
zbuffer[n].zbufcounter = 0;
zbuffer[n].zbufsize = bufsize;
zbuffer[n].zbuffilep = buffile;
zbuffer[n].zbufitemsize = bufitemsize;
}
/* function to return a pointer to the next item in a chosen input
* buffer 'n'; it reloads the buffer if necessary ... returns NULL
* pointer when there's nothing left in the file ...
*/
char *next_input_item (n, zbuffer)
int n;
ZBUF zbuffer[];
{
char *result;
if (zbuffer[n].zbufcounter == 0)
load_zbuffer (n, zbuffer);
zbuffer[n].zbufcounter -= zbuffer[n].zbufitemsize;
if (zbuffer[n].zbufcounter >= 0)
{
result = zbuffer[n].zbufp;
zbuffer[n].zbufp += zbuffer[n].zbufitemsize;
return (result);
}
else
return (NULL);
}
/* function to load the nth zbuffer appropriately ... it resets the buffer
* pointers, etc. ...
*/
void load_zbuffer (n, zbuffer)
int n;
ZBUF zbuffer[];
{
long nread;
OSErr err;
nread = zbuffer[n].zbufsize;
err = FSRead (zbuffer[n].zbuffilep, &nread, zbuffer[n].zbufbase);
zbuffer[n].zbufp = zbuffer[n].zbufbase;
zbuffer[n].zbufcounter = nread;
if (err != noErr && err != eofErr)
{
give_msg ("\pSorry, fatal error loading buffer -- click mouse to exit, then reboot!");
beepWait ();
RestoreA4();
ExitToShell ();
}
return;
}
/* function to return a pointer to the right place to store the next
* output item for the nth zbuffer ... when the buffer becomes full,
* it flushes it to disk, resets pointers, etc.
*/
char *next_output_item (n, zbuffer)
int n;
ZBUF zbuffer[];
{
char *result;
if (zbuffer[n].zbufcounter == zbuffer[n].zbufsize)
flush_zbuffer (n, zbuffer);
result = zbuffer[n].zbufp;
zbuffer[n].zbufp += zbuffer[n].zbufitemsize;
zbuffer[n].zbufcounter += zbuffer[n].zbufitemsize;
return (result);
}
/* function to flush out a zbuffer to the disk ...
*/
void flush_zbuffer (n, zbuffer)
int n;
ZBUF zbuffer[];
{
long chars_written;
if (zbuffer[n].zbufcounter == 0)
return;
chars_written = zbuffer[n].zbufcounter;
if (FSWrite (zbuffer[n].zbuffilep, &chars_written,
zbuffer[n].zbufbase) != noErr || chars_written == 0)
{
give_msg ("\pSorry, fatal error flushing buffer -- click mouse to exit, then reboot!");
beepWait ();
RestoreA4();
ExitToShell ();
}
zbuffer[n].zbufp = zbuffer[n].zbufbase;
zbuffer[n].zbufcounter = 0;
}
/* ------------------index-building functions------------------*/
int build_indices (zbufsiz, doc_file, vRef0, pass_number)
long zbufsiz;
int doc_file, vRef0, pass_number;
{
long doc_bufsiz, offset, nwords, ndistinctwords;
char *doc, **ptr, info[128];
int i;
doc_bufsiz = 2 * NMERGE * zbufsiz / 3 - KEY_LENGTH;
doc = make_buf (doc_bufsiz + KEY_LENGTH);
ptr = (char **)make_buf (doc_bufsiz * 2);
offset = zftell (doc_file);
nwords = load_doc_buffer (doc, doc_bufsiz, ptr, doc_file);
if (nwords == 0)
{
DisposPtr (doc);
DisposPtr (ptr);
return (FALSE);
}
zqsort (ptr, ptr + nwords);
ndistinctwords = write_sorted_files (doc, ptr, nwords, offset, zbufsiz,
pass_number, vRef0);
DisposPtr (doc);
DisposPtr (ptr);
strncpy (info + 1, "Index subfile #", 15);
i = 16 + putNum (info + 16, pass_number + 1);
strncpy (info + i, ": ", 3);
i += 3;
i += putNum (info + i, nwords);
strncpy (info + i, " total words, ", 14);
i += 14;
i += putNum (info + i, ndistinctwords);
strncpy (info + i, " distinct words.", 16);
i += 16;
info[0] = i - 1;
give_msg (info);
return (TRUE);
}
/* ---------functions to load text from document file filter it-------- */
/* function to create a buffer for me to use...
*/
char *make_buf (bufsiz)
long bufsiz;
{
char *buf;
buf = NewPtr (bufsiz);
if (buf == 0)
{
give_msg ("\pSorry, fatal error creating buffer -- click mouse to exit, then reboot!");
beepWait ();
RestoreA4();
ExitToShell ();
}
return (buf);
}
/* function to load the document buffer ... bring in doc_bufsiz
* characters, and then enough more to finish out the final word,
* followed by a terminal delimiter .... as the characters are scanned
* filter them appropriately (change delimiters to '\0's) and
* build the pointer array in memory to the first character of each
* word ... return the total number of words that were
* read in to the buffer (zero if we're finished with the file)
*
* ... note that one must be sure to pull in and throw away
* any excess characters beyond KEY_LENGTH in the final word, so that
* the remaining fragment doesn't show up as the first "word" in the
* next chunk of the file....
*
* For maximum speed and efficiency, use a hybrid approach: do a
* big buffer-sized chunk of the file first, then take one character at
* a time to finish off....
*/
long load_doc_buffer (doc, doc_bufsiz, ptr, doc_file)
register char *doc, **ptr;
long doc_bufsiz;
int doc_file;
{
int i, err;
register int c, in_a_word = FALSE;
char **ptr0, *end_doc_buf;
ptr0 = ptr;
err = FSRead (doc_file, &doc_bufsiz, doc);
if (err != noErr && err != eofErr)
{
give_msg ("\pSorry, fatal error reading text -- click mouse to exit, then reboot!");
beepWait ();
RestoreA4();
ExitToShell ();
}
end_doc_buf = doc + doc_bufsiz;
while (doc < end_doc_buf)
{
c = filter_table[*(unsigned char *)doc];
if (c == '\0')
in_a_word = FALSE;
else if (! in_a_word)
{
*ptr++ = doc;
in_a_word = TRUE;
}
*doc++ = c;
}
if (err != eofErr && doc == end_doc_buf && in_a_word)
{
for (i = 0; i < KEY_LENGTH; ++i)
{
c = zgetc (doc_file);
if (c == EOF)
{
*doc++ = '\0';
break;
}
c = filter_table[c];
if (c == '\0')
{
*doc++ = '\0';
break;
}
*doc++ = c;
}
if (i == KEY_LENGTH)
while ((c = zgetc (doc_file)) != EOF && filter_table[c])
;
}
return (ptr - ptr0);
}
/* my function to replace getc() ... give up and call it EOF on any error
* at all, at this point! ... note also that we must return the UNSIGNED
* character read in, to handle Mac special characters properly....
*/
int zgetc (refNum)
int refNum;
{
unsigned char readbuf[1];
long count = 1;
if (FSRead (refNum, &count, readbuf) == noErr)
return (readbuf[0]);
else
return (EOF);
}
/* function to copy a string of length up to n from s2 to s1....
*/
char *strncpy (s1, s2, n)
register char *s1, *s2;
register int n;
{
char *s0 = s1;
if (n > 0)
{
while (n-- && (*s1++ = *s2++));
while (n > 0)
{ /* pad out to n chars -- H&S specs it this way... */
*s1++ = '\0';
n--;
}
}
return (s0);
}
/* ------functions to merge key and ptr files-------------------- */
/* function to do the real work of merging sorted k & p
* files into a single sorted file...
*
* Procedure is as follows:
*
* Compare the current key records from each of the N files to be merged.
* Take the smallest of the keys (or, if there is a tie, take the one
* from the earlier file number) and write its pointer records out to
* the *.p file for the next generation. If the key is a new one, that
* is, if it differs from the previous key, write out the previous key
* word and the value for cumulative_counts to the *.k file, and reset
* the previous key's value to that of the current key. Then repeat
* this whole procedure, until all the input files are exhausted.
*
* The above works provided that we are careful in choosing the smallest
* key and never let a file that has been exhausted (reached EOF) win a
* comparison. The function smallest_key does that properly below, since
* a file that is at EOF has returned a NULL pointer for its key_rec...
*
* For each file, the variable prev_cc[i] holds the previous value of
* cumulative_counts for that file, so that we can determine how
* many ptr_recs to write out by doing a simple subtraction.
*
* Buffer numbering scheme: the Kth input file has zbuffer #K
* for its keys and zbuffer #(K+n) for its pointers; the output
* buffers are zbuffers #(2*n) for keys and #(2*n+1) for pointers.
*/
void nway_merge_kpfiles (ink, inp, outk, outp, n, zbufsiz)
int ink[], inp[], outk, outp, n;
long zbufsiz;
{
int i;
KEY_REC *kr[NMERGE], prev_key;
long prev_cc[NMERGE], out_cc;
ZBUF zbuffer[NMERGE * 2 + 2];
for (i = 0; i < n; ++i)
prev_cc[i] = 0;
out_cc = 0;
for (i = 0; i < n; ++i)
{
create_zbuffer (i, zbufsiz, ink[i], sizeof(KEY_REC), zbuffer);
create_zbuffer (i + n, zbufsiz, inp[i], sizeof(long), zbuffer);
}
create_zbuffer (2 * n, zbufsiz, outk, sizeof(KEY_REC), zbuffer);
create_zbuffer (2 * n + 1, zbufsiz, outp, sizeof(long), zbuffer);
for (i = 0; i < n; ++i)
kr[i] = (KEY_REC *)next_input_item (i, zbuffer);
i = smallest_key (kr, n);
strncpy (prev_key.kkey, kr[i]->kkey, KEY_LENGTH);
while (merge_not_finished (kr, n))
{
i = smallest_key (kr, n);
if (zstrcmp (prev_key.kkey, kr[i]->kkey))
{
copy_key_rec (prev_key.kkey, out_cc, 2 * n, zbuffer);
strncpy (prev_key.kkey, kr[i]->kkey, KEY_LENGTH);
}
copy_ptr_recs (i + n, kr[i]->ccount - prev_cc[i], 2 * n + 1, zbuffer);
out_cc += kr[i]->ccount - prev_cc[i];
prev_cc[i] = kr[i]->ccount;
kr[i] = (KEY_REC *)next_input_item (i, zbuffer);
}
copy_key_rec (prev_key.kkey, out_cc, 2 * n, zbuffer);
flush_zbuffer (2 * n, zbuffer);
flush_zbuffer (2 * n + 1, zbuffer);
for (i = 0; i < 2 * n + 2; ++i)
DisposPtr (zbuffer[i].zbufbase);
}
/* function to copy a chosen number of ptr_recs (longs) from one file
* to another ... used in merging two files by merge_kpfiles() above....
*/
void copy_ptr_recs (inzbuf, count, outzbuf, zbuffer)
int inzbuf, outzbuf;
long count;
ZBUF zbuffer[];
{
long *inp, *outp;
for ( ; count > 0; --count)
{
inp = (long *)next_input_item (inzbuf, zbuffer);
outp = (long *)next_output_item (outzbuf, zbuffer);
*outp = *inp;
}
}
/* function to write a key record including kkey (key word) and ccount
* (cumulative_count) to an output file...
*/
void copy_key_rec (kkey, cc, outzbuf, zbuffer)
char *kkey;
long cc;
int outzbuf;
ZBUF zbuffer[];
{
KEY_REC *outp;
outp = (KEY_REC *)next_output_item (outzbuf, zbuffer);
strncpy (outp->kkey, kkey, KEY_LENGTH);
outp->ccount = cc;
}
/* simple function to see if we are not yet finished with all of the input
* files coming in to the merge operation ... signified by at least one of
* the key pointers being non-NULL....
*/
int merge_not_finished (kr, n)
KEY_REC *kr[];
register int n;
{
register int i;
for (i = 0; i < n; ++i)
if (kr[i] != NULL)
return (TRUE);
return (FALSE);
}
/* function to determine the smallest of the n keys pointed to by the
* kr[] vector of pointers to KEY_RECs ... note that a NULL ptr ranks
* after any other...also note that in case of a tie, the smaller
* value of i is the one to return (for a stable merging sort)
*/
smallest_key (kr, n)
KEY_REC *kr[];
register int n;
{
register int i, smallest;
for (i = 0; i < n; ++i)
if (kr[i] != NULL)
{
smallest = i;
break;
}
for (i = smallest + 1; i < n; ++i)
if (kr[i] != NULL)
if (zstrcmp (kr[smallest]->kkey, kr[i]->kkey) > 0)
smallest = i;
return (smallest);
}
/* ---------------function to merge index files------------- */
/* function to merge sorted indices together repeatedly until finished
* with them all in a single set of *.k/*.p files ...
*
* The merging strategy is straightforward enough:
* Let "g" denote the generation_number and "f" denote the file_number.
* Temporary file names begin with the letter z, which is then followed
* by a generation number (decimal), the letter k or p (standing for
* key or ptr, respectively), and then the file number (decimal). Thus,
* the file "z0k0" is the keys file #0 for generation #0 (the starting,
* pre-merging, generation), file "z2p3" is the ptr file #3 for generation
* #2, etc.
*
* (The following discussion is specifically for a 2-way merge ... but
* the generalization for N-way merging is straightforward.)
*
* On a given call to merge_indices, the following may happen:
* - files zgkf/zgpf and zgk(f+1)/zgp(f+1) are merged into files
* z(g+1)k(f/2)/z(g+1)p(f/2), and then the parent files are
* deleted;
* - file zgkf isn't found, which means we are done with this
* generation and must go on to the next;
* - file zgk(f+1) isn't found, which means that either we are
* completely done with the merging work (if f=0) and just
* have to rename the files zgkf/zgpf into the correct final
* names (that is, doc_filename.k/doc_filename.p), or else
* (if f>0) we have an odd number of files for this level
* of merging, and therefore just have to rename zgkf/zgpf
* to z(g+1)k(f/2)/z(g+1)p(f/2).
*
* to work as a nice XFCN, this function returns the upcoming
* generation_number, and lets the caller reset the file_number to zero
* when the generation_number returned differs from the input number, or
* increment the file_number by NMERGE when still on same generation.
* Also, when all finished, we return an illegal (negative) generation_number
* value....
*/
int merge_indices (zbufsiz, doc_file, vRef0, file_number,
generation_number, doc_filename)
long zbufsiz;
int doc_file, vRef0, file_number, generation_number;
Str255 doc_filename;
{
int ink[NMERGE], inp[NMERGE], outk, outp;
long inwords, indistinctwords, outdistinctwords;
int i, n;
char info[128];
for (n = 0; n < NMERGE; ++n)
{
ink[n] = open_inkfile (file_number + n, vRef0, generation_number);
if (ink[n] == NULL)
break;
inp[n] = open_inpfile (file_number + n, vRef0, generation_number);
}
if (file_number + n == 1)
{
close_infiles (ink, inp, n);
fix_final_file_names (doc_filename, vRef0, generation_number);
return (-1);
}
if (n < 2)
{
if (n == 1)
{
close_infiles (ink, inp, n);
fix_oddball_file_name (vRef0, generation_number, file_number);
}
strncpy (info + 1, "Beginning merge generation #", 28);
i = 29 + putNum (info + 29, 2 + generation_number);
info[0] = i - 1;
give_msg (info);
return (generation_number + 1);
}
outk = open_outkfile (vRef0, generation_number, file_number);
outp = open_outpfile (vRef0, generation_number, file_number);
inwords = 0;
indistinctwords = 0;
for (i = 0; i < n; ++i)
{
inwords += file_size (inp[i]) / sizeof(long);
indistinctwords += file_size (ink[i]) / sizeof(KEY_REC);
}
nway_merge_kpfiles (ink, inp, outk, outp, n, zbufsiz);
outdistinctwords = file_size (outk) / sizeof(KEY_REC);
strncpy (info + 1, "Merge #", 7);
i = 8 + putNum (info + 8, 1 + file_number / NMERGE);
strncpy (info + i, ": ", 3);
i += 3;
i += putNum (info + i, inwords);
strncpy (info + i, " total words, ", 14);
i += 14;
i += putNum (info + i, indistinctwords);
strncpy (info + i, " distinct words in, ", 20);
i += 20;
i += putNum (info + i, outdistinctwords);
strncpy (info + i, " out.", 5);
i += 5;
info[0] = i - 1;
give_msg (info);
close_infiles (ink, inp, n);
FSClose (outk);
FSClose (outp);
remove_used_infiles (n, vRef0, generation_number, file_number);
return (generation_number);
}
/* ------------file utility functions---------------------------- */
/* function to write out sorted k & p files based on the doc and ptr
* arrays in memory....
*
* The kfile format is as described in detail elsewhere:
* the key word, turned into all capital letters and with spaces
* afterward, of fixed length KEY_LENGTH; and
* the cumulative count of how many words have passed before, including
* the current word, a long integer.
*/
long write_sorted_files (doc, ptr, nwords, offset, zbufsiz, pass_number,
vRef0)
char *doc, **ptr;
long nwords, offset, zbufsiz;
int pass_number, vRef0;
{
int kfile, pfile;
char *prev_word;
KEY_REC *outk;
long *outp, i;
ZBUF zbuffer[2];
if (nwords == 0)
return;
kfile = open_kfile (vRef0, pass_number);
pfile = open_pfile (vRef0, pass_number);
create_zbuffer (0, zbufsiz, kfile, sizeof(KEY_REC), zbuffer);
create_zbuffer (1, zbufsiz, pfile, sizeof(long), zbuffer);
prev_word = ptr[0];
outk = (KEY_REC *)next_output_item (0, zbuffer);
write_new_key (ptr[0], outk->kkey);
for (i = 0; i < nwords; ++i)
{
if (is_new_word (prev_word, ptr[i]))
{
outk->ccount = i;
outk = (KEY_REC *)next_output_item (0, zbuffer);
write_new_key (ptr[i], outk->kkey);
prev_word = ptr[i];
}
outp = (long *)next_output_item (1, zbuffer);
*outp = (ptr[i] - doc) + offset;
}
outk->ccount = i;
flush_zbuffer (0, zbuffer);
flush_zbuffer (1, zbuffer);
DisposPtr (zbuffer[0].zbufbase);
DisposPtr (zbuffer[1].zbufbase);
i = file_size (kfile) / sizeof(KEY_REC);
FSClose (kfile);
FSClose (pfile);
return (i);
}
/* function to determine if the current word is the same as or different
* from the previous word -- if it is different, we'll need to write an
* entry out to the key file kfile -- compare the words up to the first
* '\0', or for a maximum distance of KEY_LENGTH, and return TRUE
* if they differ, FALSE if they are identical that far. Thus, a simple
* call to zstrcmp() does the job.... but keep ours as a function instead
* of a macro call for the moment, for safety and readability....
*/
int is_new_word (w0, w1)
char *w0, *w1;
{
return (zstrcmp (w0, w1));
}
/* function to write out a new key entry in the key_file:
* KEY_LENGTH letters consisting of the key word (which will be found
* delimited by a '\0'), followed by enough blanks to fill out the
* record to total length KEY_LENGTH ...
*/
void write_new_key (p, kp)
register char *p, *kp;
{
register int i, c;
for (i = 0; i < KEY_LENGTH; ++i)
{
c = *p++;
if (c == '\0')
break;
*kp++ = c;
}
for ( ; i < KEY_LENGTH; ++i)
*kp++ = ' ';
return;
}
/* a very simple function to return the size of a file ... doesn't do any
* error-checking -- sorry!
*/
long file_size (refnum)
int refnum;
{
long result;
GetEOF (refnum, &result);
return (result);
}
/* ---------------miscellaneous useful functions----------------- */
/* function to open an input key file for this generation and file number ...
* note the limitation to no more than 9 generations, which with a 4-way
* merge should be more than enough for a few years to come! Note that
* this function should return 0 and not give up if it attempts to open
* a nonexistent file -- that's the signal for merge_indices() to know
* that all of the input files have been used up....
*/
int open_inkfile (file_num, vRef0, generation_number)
int file_num, vRef0, generation_number;
{
Str255 fname;
int i;
fname[1] = 'z';
fname[2] = generation_number + '0';
fname[3] = 'k';
i = putNum ((char *)fname + 4, file_num);
fname[0] = i + 3;
if (FSOpen (fname, vRef0, &i) != noErr)
return (NULL);
return (i);
}
/* function to open an input ptr file for this generation and file number
*/
int open_inpfile (file_num, vRef0, generation_number)
int file_num, vRef0, generation_number;
{
Str255 fname;
int i;
fname[1] = 'z';
fname[2] = generation_number + '0';
fname[3] = 'p';
i = putNum ((char *)fname + 4, file_num);
fname[0] = i + 3;
if (FSOpen (fname, vRef0, &i) != noErr)
return (NULL);
return (i);
}
/* function to open an output key file for this generation and file number...
* use my Official Apple-Approved File Type 'CTLZ' (for control-Z = ^Z)...
*/
int open_outkfile (vRef0, generation_number, file_number)
int vRef0, generation_number, file_number;
{
Str255 fname;
int i;
fname[1] = 'z';
fname[2] = generation_number + 1 + '0';
fname[3] = 'k';
i = putNum ((char *)fname + 4, file_number / NMERGE);
fname[0] = i + 3;
if (Create (fname, vRef0, '????', 'CTLZ') != noErr)
{
give_msg ("\pSorry, fatal error creating merge key file -- click mouse to exit, then reboot!");
beepWait ();
RestoreA4();
ExitToShell ();
}
if (FSOpen (fname, vRef0, &i) != noErr)
{
give_msg ("\pSorry, fatal error opening merge key file -- click mouse to exit, then reboot!");
beepWait ();
RestoreA4();
ExitToShell ();
}
return (i);
}
/* function to open an output ptr file for this generation and file number
* again, use 'CTLZ' file type...
*/
int open_outpfile (vRef0, generation_number, file_number)
int vRef0, generation_number, file_number;
{
Str255 fname;
int i;
fname[1] = 'z';
fname[2] = generation_number + 1 + '0';
fname[3] = 'p';
i = putNum ((char *)fname + 4, file_number / NMERGE);
fname[0] = i + 3;
if (Create (fname, vRef0, '????', 'CTLZ') != noErr)
{
give_msg ("\pSorry, fatal error creating merge ptr file -- click mouse to exit, then reboot!");
beepWait ();
RestoreA4();
ExitToShell ();
}
if (FSOpen (fname, vRef0, &i) != noErr)
{
give_msg ("\pSorry, fatal error opening merge ptr file -- click mouse to exit, then reboot!");
beepWait ();
RestoreA4();
ExitToShell ();
}
return (i);
}
/* function to rename the remaining last unpaired key file & ptr file
* from one generation to the next...
*/
void fix_oddball_file_name (vRef0, generation_number, file_number)
int vRef0, generation_number, file_number;
{
Str255 oldname, newname;
int i;
oldname[1] = 'z';
oldname[2] = generation_number + '0';
oldname[3] = 'k';
i = putNum ((char *)oldname + 4, file_number);
oldname[0] = i + 3;
newname[1] = 'z';
newname[2] = generation_number + 1 + '0';
newname[3] = 'k';
i = putNum ((char *)newname + 4, file_number / NMERGE);
newname[0] = i + 3;
if (Rename (oldname, vRef0, newname) != noErr)
{
give_msg ("\pSorry, error renaming merge key file ... click mouse to continue");
beepWait ();
}
oldname[1] = 'z';
oldname[2] = generation_number + '0';
oldname[3] = 'p';
i = putNum ((char *)oldname + 4, file_number);
oldname[0] = i + 3;
newname[1] = 'z';
newname[2] = generation_number + 1 + '0';
newname[3] = 'p';
i = putNum ((char *)newname + 4, file_number / NMERGE);
newname[0] = i + 3;
if (Rename (oldname, vRef0, newname) != noErr)
{
give_msg ("\pSorry, error renaming merge ptr file ... click mouse to continue");
beepWait ();
}
return;
}
/* function to give the final key and ptr files their proper ultimate
* names ... ok for it to mess with doc_filename string....
*/
void fix_final_file_names (doc_filename, vRef0, generation_number)
Str255 doc_filename;
int vRef0, generation_number;
{
Str255 oldname;
oldname[0] = 4;
oldname[1] = 'z';
oldname[2] = generation_number + '0';
oldname[3] = 'k';
oldname[4] = '0';
doc_filename[++doc_filename[0]] = '.';
doc_filename[++doc_filename[0]] = 'k';
if (Rename (oldname, vRef0, doc_filename) != noErr)
{
give_msg ("\pSorry, error renaming final key file ... click mouse to continue");
beepWait ();
}
oldname[3] = 'p';
doc_filename[doc_filename[0]] = 'p';
if (Rename (oldname, vRef0, doc_filename) != noErr)
{
give_msg ("\pSorry, error renaming final ptr file ... click mouse to continue");
beepWait ();
}
return;
}
/* function to get rid of the superfluous k & p files now that they
* have been merged into the next generation....
*/
void remove_used_infiles (n, vRef0, generation_number, file_number)
int n, vRef0, generation_number, file_number;
{
Str255 fname;
int i, j;
fname[1] = 'z';
fname[2] = generation_number + '0';
for (i = 0; i < n; ++i)
{
fname[3] = 'k';
j = putNum ((char *)fname + 4, file_number + i);
fname[0] = j + 3;
if (FSDelete (fname, vRef0) != noErr)
{
give_msg ("\pSorry, error deleting merge key file ... click mouse to continue");
beepWait ();
}
fname[3] = 'p';
if (FSDelete (fname, vRef0) != noErr)
{
give_msg ("\pSorry, error deleting merge ptr file ... click mouse to continue");
beepWait ();
}
}
return;
}
/* function to close out the ink and inp files that have been opened...
*/
void close_infiles (ink, inp, n)
int ink[], inp[];
int n;
{
int i;
for (i = 0; i < n; ++i)
{
FSClose (ink[i]);
FSClose (inp[i]);
}
}
/* function to open the key file with proper name for this pass ... file will be
* named "z0kN" where N represents the pass number .... type 'CTLZ'
*/
int open_kfile (vRef0, pass_number)
int vRef0, pass_number;
{
Str255 fname;
int i;
fname[1] = 'z';
fname[2] = '0';
fname[3] = 'k';
i = putNum ((char *)fname + 4, pass_number);
fname[0] = i + 3;
if (Create (fname, vRef0, '????', 'CTLZ') != noErr)
{
give_msg ("\pSorry, fatal error creating key file -- click mouse to exit, then reboot!");
beepWait ();
RestoreA4();
ExitToShell ();
}
if (FSOpen (fname, vRef0, &i) != noErr)
{
give_msg ("\pSorry, fatal error opening key file -- click mouse to exit, then reboot!");
beepWait ();
RestoreA4();
ExitToShell ();
}
return (i);
}
/* function to open the ptr file with proper name for this pass ... file will be
* named "z0pN" where N represents the pass number .... type 'CTLZ'
*/
int open_pfile (vRef0, pass_number)
int vRef0, pass_number;
{
Str255 fname;
int i;
fname[1] = 'z';
fname[2] = '0';
fname[3] = 'p';
i = putNum ((char *)fname + 4, pass_number);
fname[0] = i + 3;
if (Create (&fname, vRef0, '????', 'CTLZ') != noErr)
{
give_msg ("\pSorry, fatal error creating ptr file -- click mouse to exit, then reboot!");
beepWait ();
RestoreA4();
ExitToShell ();
}
if (FSOpen (&fname, vRef0, &i) != noErr)
{
give_msg ("\pSorry, fatal error opening ptr file -- click mouse to exit, then reboot!");
beepWait ();
RestoreA4();
ExitToShell ();
}
return (i);
}
/* function to take the place of stdio's ftell()
*/
long zftell (refnum)
int refnum;
{
long ans;
GetFPos (refnum, &ans);
return (ans);
}
/* --------------function to do quicksort -------------------------- */
/* sort elements "first" through "last" */
void zqsort (first, last)
register char **first, **last;
{
register char **i, **j, *tmp;
while (last - first > 1)
{
i = first;
j = last;
for (;;)
{
while (++i < last && compare_ptrs (i, first) < 0)
;
while (--j > first && compare_ptrs (j, first) > 0)
;
if (i >= j)
break;
tmp = *i;
*i = *j;
*j = tmp;
}
tmp = *first;
*first = *j;
*j = tmp;
if (j - first < last - (j + 1))
{
zqsort (first, j);
first = j + 1;
}
else
{
zqsort (j + 1, last);
last = j;
}
}
}
/* function to compare two index ptrs and give a result appropriate
* for quicksort to use in alphabetizing them....
*
* Since the words pointed to have already been turned into all capital
* letters and delimiters have been filtered out, simply doing zstrcmp()
* for KEY_LENGTH letters works fine!
*
* Slight modification to make the quicksort stable: if two words tie,
* then we want to compare their pointers to make the lesser one come
* out first in the sort ...
*/
int compare_ptrs (p1, p2)
register char **p1, **p2;
{
register int diff;
diff = zstrcmp (*p1, *p2);
if (diff == 0)
diff = ((*p1 - *p2) > 0) ? 1 : -1;
return (diff);
}
/* my function to compare two strings and give a result as to who is
* alphabetically earlier. Note that this is almost the same as strncmp()
* with the fixed value of KEY_LENGTH as the maximum comparison distance,
* except that I must be sure to mask the characters to make them positive
* (since we want to be able to handle the non-ASCII funny letters in
* the Apple character set properly/consistently). If the masking isn't
* done, then inconsistent results can occur with those non-ASCII chars!
*/
int zstrcmp (s1, s2)
register char *s1, *s2;
{
register int n = KEY_LENGTH;
for (; --n && ((*s1 & 0xFF) == (*s2 & 0xFF)); s1++, s2++)
if (!*s1) break;
return ((*s1 & 0xFF) - (*s2 & 0xFF));
}